Take home Exercise 3 - 4 Feb 2022

Animation of graphs in R

Frostbear https://sg.linkedin.com/in/farahfoo (SMU Masters in IT business (Fintech and Analytics))https://scis.smu.edu.sg/master-it-business
2022-02-02

Context of Exercise

Using previous age-sex pyramid based on 2021 data, to apply appropriate interactivity and animation methods to design an age-sex pyramid based data visualisation to show the changes of demographic structure of Singapore by age cohort and gender between 2000-2020 at planning area level. The data set used is entitle Singapore Residents by Planning Area / Subzone, Age Group, Sex and Type of Dwelling, June 2000-2010 and Singapore Residents by Planning Area / Subzone, Age Group, Sex and Type of Dwelling, June 2011-2020, from Department of Statistics home page.

Installing and loading packages required for Age-Sex pyramid

packages = c('tidyverse', 'readxl', 'ggthemes')

for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

Loading data using _csv command

pop_data <- read_csv("data/respopagesextod2021.csv")
glimpse (data)
function (..., list = character(), package = NULL, lib.loc = NULL, 
    verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE)  

Adding Factor levels to AG field

To sort the age-sex pyramid using Age Group, we need to classify the AG field as Factor

pop_data$AG <- factor(pop_data$AG, levels = unique(pop_data$AG))

Summarising the data into the required buckets

summary_sex <- pop_data %>%
  group_by(AG, Sex) %>%
  summarise(Pop = sum(Pop)) %>%
  ungroup()

head (summary_sex,5)
# A tibble: 5 x 3
  AG       Sex        Pop
  <fct>    <chr>    <dbl>
1 0_to_4   Females  87730
2 0_to_4   Males    91400
3 5_to_9   Females  97120
4 5_to_9   Males   102390
5 10_to_14 Females  97980

Plotting double geom_bar Age-sex pyramid

ggplot(summary_sex, aes(x=AG)) +
  geom_bar(data=summary_sex[summary_sex$Sex=="Males",], aes(y=Pop*-1), stat="identity", fill="blue") +
  geom_bar(data=summary_sex[summary_sex$Sex=="Females",], aes(y=Pop), stat="identity", fill="pink") +
  geom_hline(yintercept=0, colour="white", lwd=1)+
coord_flip () +
scale_y_continuous(breaks = seq(-160000,160000,40000), labels = function(v) ifelse(abs(v)>=1000,paste0(abs(v)/1000, "K"), abs(v))) +
  labs(y="Population", x="Gender") +
  ggtitle("                        Male                                                Female")

Building base graph for 20 years of population data

For animation of population across time, data source can be found here at singstat website.

year2000 <- read_csv("data/respopagesextod2000to2010.csv")
year2011 <- read_csv("data/respopagesextod2011to2020.csv") 

head (year2000,3)
# A tibble: 3 x 7
  PA         SZ        AG     Sex   TOD                      Pop  Time
  <chr>      <chr>     <chr>  <chr> <chr>                  <dbl> <dbl>
1 Ang Mo Kio Cheng San 0_to_4 Males HDB 1- and 2-Room Fla~    20  2000
2 Ang Mo Kio Cheng San 0_to_4 Males HDB 3-Room Flats         480  2000
3 Ang Mo Kio Cheng San 0_to_4 Males HDB 4-Room Flats         220  2000
head (year2011,3)
# A tibble: 3 x 7
  PA         SZ                     AG     Sex   TOD         Pop  Time
  <chr>      <chr>                  <chr>  <chr> <chr>     <dbl> <dbl>
1 Ang Mo Kio Ang Mo Kio Town Centre 0_to_4 Males HDB 1- a~     0  2011
2 Ang Mo Kio Ang Mo Kio Town Centre 0_to_4 Males HDB 3-Ro~    10  2011
3 Ang Mo Kio Ang Mo Kio Town Centre 0_to_4 Males HDB 4-Ro~    30  2011
# Since columns are the same, we can combine the 2 files into 1 file for processing

combined <- rbind(year2000,year2011)
unique(combined$Time)
 [1] 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
[14] 2013 2014 2015 2016 2017 2018 2019 2020
# in the Time column, there are only numbers, hence the row header was not copied into the data

Adding Factor levels to Age group field

To sort the age-sex pyramid using Age Group, we need to classify the AG field as Factor

combined$AG <- factor(combined$AG, levels = unique(combined$AG))

Summarising data by Age Group, Sex and Time

To plot the graph over the different years, we need to call out the Time field as a column (variable)

summary_sex_20 <- combined %>%
  group_by(AG, Sex, Time) %>%
  summarise(Pop = sum(Pop)) %>%
  ungroup()

head (summary_sex_20,5)
# A tibble: 5 x 4
  AG     Sex      Time    Pop
  <fct>  <chr>   <dbl>  <dbl>
1 0_to_4 Females  2000 108850
2 0_to_4 Females  2001 107510
3 0_to_4 Females  2002 105310
4 0_to_4 Females  2003 101430
5 0_to_4 Females  2004  99290

Plotting double geom_bar Age-sex pyramid for 20 years

Using the individual Age-sex pyramid from above (plotted for year 2021), we re-use the code to plot out 20 pyramid graphs, 1 graph for each year.

ggplot(summary_sex_20, aes(x=AG)) +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Males",], aes(y=Pop*-1), stat="identity", fill="blue") +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Females",], aes(y=Pop), stat="identity", fill="pink") +
  geom_hline(yintercept=0, colour="white", lwd=1)+
  
coord_flip () +
  
scale_y_continuous(breaks = seq(-160000,160000,40000), labels = function(v) ifelse(abs(v)>=1000,paste0(abs(v)/1000, "K"), abs(v))) +
  
  labs(title = "Age-Sex Population Pyramid, Singapore 2021", 
   caption = 'Data Source: Department of Statistics (June 2021)',
   y = "Population", x = "Gender") + 
  
  theme_bw() +
   theme(legend.position = "none")+
  theme(plot.title = element_text(size=16))+
  theme(plot.subtitle = element_text(size=12))+
  
facet_wrap(. ~ `Time`,ncol=4)

It is clear from the 20 graphs displayed, that the difference in population year on year is not clear. To show more clarity, we use the year as base to transition the graph in 1 frame in the next section.

Using gganimate

but first, we enhance the graph by

adding title caption theme find out the maximum and minimum values of the population to set the chart axis to ensure all the values will be captured properly.

Activating gganimate as it will be used for the animation of the age-sex pyramid over the 20 years

We call out the package required which is ggaminate.

packages = c('gganimate')

for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

Then we find out the max and min values of the population set.

max(summary_sex_20$Pop)
[1] 164060
min(summary_sex_20$Pop)
[1] 1380

Improving the existing code by adding the range limits, title, subtitle and theme.

SG20 <- ggplot(summary_sex_20, aes(x=AG,colour=Sex,fill=Sex)) +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Males",], aes(y=Pop*-1), stat="identity") +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Females",], aes(y=Pop), stat="identity") +
  geom_hline(yintercept=0, colour="white", lwd=1) +
  
coord_flip() +
  
scale_y_continuous(limits = c(-170000, 170000), n.breaks = 10, labels = function(v) ifelse(abs(v)>= 1000,paste0(abs(v)/1000, "K"), abs(v))) +
  
  labs(title = "Singapore Age-Sex Population Pyramid for 20 years",
    subtitle = 'Year: "{round(frame_time, 0)}"',
    caption = 'Data Source: Department of Statistics (June 2000 to June 2020)',
  y = 'Male and Female Population',
  x = 'Age Group') +
  
  theme_bw () +
   theme(legend.text = element_text(size=12))+
  theme(plot.title = element_text(size=16))+
  theme(plot.subtitle = element_text(size=10))

SG20

Animating the age-sex pyramid

SG20 +
transition_time(Time) +
ease_aes('linear')

Interactive plots

Loading packages for interactive plots

packages = c('tidyverse', 'readxl', 'ggthemes', 'ggiraph', 'plotly', 
             'gganimate', 'patchwork', 'DT', 'gifski', 'gapminder')

for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

Creating new data set for interactive plots

interactive_data <- combined %>%
  spread (Sex, sum(Pop)) %>% 
  mutate(Total = Females + Males) %>% 
  group_by(Time, AG) %>% 
  summarise(Female = sum(Females), Male =sum(Males), Total = sum(Total))

head (interactive_data,5)
# A tibble: 5 x 5
# Groups:   Time [1]
   Time AG       Female   Male  Total
  <dbl> <fct>     <dbl>  <dbl>  <dbl>
1  2000 0_to_4   108850 117000 225850
2  2000 5_to_9   124620 132320 256940
3  2000 10_to_14 113730 121780 235510
4  2000 15_to_19 102320 109090 211410
5  2000 20_to_24 106230 106320 212550
p1 <- ggplot (data= interactive_data,
        aes(x = Time, 
            y = Total)) + 
  geom_point(dotsize = 1) 
  
  p1

p2 <- ggplot (data= interactive_data,
        aes(x = Female,
            y = Male,
            colour = AG)) + 
  geom_point(size = 1)

p2

Putting 2 graphs side by side

d <- highlight_key(interactive_data)

p1 <- ggplot (data= d,
        aes(x = Time, y = Total)) + 
  geom_col()


p2 <- ggplot (data= d,
        aes(x = Female,
            y = Male,
            colour = AG)) + 
  geom_point(size = 1)

subplot (ggplotly (p1),
         ggplotly (p2))

Linking graph with data table using crosstalk

p1 <- ggplot (data= d,
        aes(x = Female,
            y = Male,
            colour = AG)) + 
  geom_point(size = 1)

gg <- highlight(ggplotly(p1),
                "plotly_selected")

crosstalk::bscols(gg,
                  DT::datatable(d),
                  widths = 5)